/*
 ============================================================================
 Name        : texttodna.c
 Author      : M.Barsky
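 Description : Strips every character except a, c, g, t (upper or lower case)
 from the numbered text files in a directory and writes the cleaned sequences.
 Optionally also writes, for each kept character, its position in the
 original file (position mapping).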
 ============================================================================
 */

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

/* Capacity, in bytes, of every file-name buffer declared in this file. */
#define MAX_PATH_LENGTH 200

/* Capacity, in bytes, of FileInfo.stringName. */
#define MAX_SEQ_NAME_LENGTH 100


/*
 * Smaller of a and b. The selected argument is evaluated twice, so do not
 * pass expressions with side effects. Not used by the code in this file.
 */
#define MIN(a, b) ((a)<=(b) ? (a) : (b))


/*
 * Record holding a file name, a length and a sequence name.
 * No function in this file reads or writes it.
 * NOTE(review): field meanings below are inferred from the names only --
 * confirm against the code that actually uses this struct.
 */
typedef struct FileInfo
{
	/* presumably the path of the file */
	char fileName[MAX_PATH_LENGTH];
	/* presumably the length of the sequence stored in the file */
	int length;
	/* presumably a human-readable name of the sequence */
	char stringName[MAX_SEQ_NAME_LENGTH];
}FileInfo;

/*
 * Record holding a file name, a size and three offsets/lengths.
 * No function in this file reads or writes it.
 * NOTE(review): field meanings below are inferred from the names only --
 * confirm against the code that actually uses this struct.
 */
typedef struct FileData
{
	/* presumably the path of the file */
	char FileName[MAX_PATH_LENGTH];
	/* presumably the size of the file */
	int fileSize;
	/* presumably how much of the file should be indexed */
	int lengthToIndex;
	/* presumably the offset of this file's data inside a merged file */
	int startInMergedFile;
	/* presumably the offset of the data inside the original file */
	int startInOriginalFile;
}FileData;

/*
 * Smallest cleaned (a/c/g/t only) length seen so far. main() seeds it from
 * the second int of the info file, textToDNA() and
 * textToDNAWithPositionsMapping() lower it, and main() writes it back.
 */
int minActualLength;


/*
 * Copies the nucleotide characters of inputBuffer into outputBuffer,
 * lower-casing them and discarding every other character.
 *
 * inputBuffer   - raw text; its first rawLength chars are examined
 * rawLength     - number of chars to examine
 * outputBuffer  - receives the kept characters ('a', 'c', 'g', 't');
 *                 this function does not NUL-terminate it
 * actualLength  - out: number of characters stored in outputBuffer
 *
 * Always returns 0.
 */
int convertToDNAAlphabet( char *inputBuffer,int rawLength,
		char *outputBuffer, int *actualLength)
{
	int kept = 0;
	int pos;

	for (pos = 0; pos < rawLength; pos++)
	{
		char symbol = inputBuffer[pos];
		char base;

		if (symbol == 'a' || symbol == 'A')
			base = 'a';
		else if (symbol == 'c' || symbol == 'C')
			base = 'c';
		else if (symbol == 'g' || symbol == 'G')
			base = 'g';
		else if (symbol == 't' || symbol == 'T')
			base = 't';
		else
			continue;	/* not a nucleotide: drop it */

		outputBuffer[kept] = base;
		kept++;
	}

	*actualLength = kept;
	return 0;
}

/*
 * Same filtering as convertToDNAAlphabet (keep a/c/g/t, lower-cased), and in
 * addition writes to positionsMappingFileName one binary int per kept
 * character: the index of that character inside inputBuffer.
 *
 * inputBuffer               - raw text; its first rawLength chars are examined
 * rawLength                 - number of chars to examine
 * outputBuffer              - receives the kept characters; not NUL-terminated
 * actualLength              - out: number of kept characters; written only
 *                             when the function succeeds
 * positionsMappingFileName  - file created/truncated and filled with the
 *                             original positions (native int format)
 *
 * Returns 0 on success, 1 if the mapping file cannot be opened or cannot be
 * written completely. A message is printed to stdout on failure.
 */
int convertToDNAAlphabetWithPositionMapping(char *inputBuffer,int rawLength,
		char *outputBuffer, int *actualLength, char *positionsMappingFileName)
{
	FILE *outputFile;
	int counter=0;
	int status=0;
	int i;

	if(!(outputFile= fopen ( positionsMappingFileName , "wb" )))
	{
		printf("Could not open output %s file for writing raw positions \n",positionsMappingFileName);
		return 1;
	}

	for(i=0;i<rawLength;i++)
	{
		char base;

		switch (inputBuffer[i])
		{
		case 'a': case 'A':
				base='a';
				break;
		case 'c': case 'C':
				base='c';
				break;
		case 'g': case 'G':
				base='g';
				break;
		case 't': case 'T':
				base='t';
				break;
		default:
				base=0;
				break;
		}

		if(base==0)
			continue;	/* not a nucleotide: nothing is recorded */

		/* i is the position of this character in the raw input */
		if(fwrite(&i, sizeof(int), 1, outputFile)!=1)
		{
			printf("Error writing mapping positions\n");
			status=1;
			break;
		}
		outputBuffer[counter++]=base;
	}

	/*
	 * stdio buffers the small writes above, so a failure such as a full
	 * disk may only be reported when fclose flushes the stream.
	 */
	if(fclose(outputFile)!=0 && status==0)
	{
		printf("Error writing mapping positions\n");
		status=1;
	}

	if(status==0)
		*actualLength=counter;
	return status;
}

/*
 * Reads the whole of textFileName, keeps only its a/c/g/t characters
 * (lower-cased, via convertToDNAAlphabet) and writes them to DNAFileName.
 * Lowers the global minActualLength when this file's cleaned length is
 * smaller than the current value.
 *
 * textFileName  - input file, read in binary mode
 * DNAFileName   - output file, created/truncated
 * inputBuffer   - caller-supplied scratch buffer that receives the raw file;
 *                 it must be at least as large as the input file (no
 *                 capacity is passed in, so this cannot be checked here)
 * outputBuffer  - caller-supplied scratch buffer for the cleaned data; it
 *                 must be at least as large as the input file
 *
 * Returns 0 on success, 1 on any failure (a message is printed to stdout).
 * Both files are closed on every path.
 */
int textToDNA (char *textFileName, char *DNAFileName, char *inputBuffer, char *outputBuffer)
{
	FILE *outputFile=NULL;
	FILE *inputFile=NULL;
	long rawSize;
	int filesize;
	int validLength=0;
	int closeResult;
	int status=1;

	if(!(outputFile= fopen ( DNAFileName , "wb" )))
	{
		printf("Could not open output DNA %s file for writing \n",DNAFileName);
		goto cleanup;
	}

	if(!(inputFile= fopen ( textFileName , "rb" )))
	{
		printf("Could not open input text file %s for reading \n",textFileName);
		goto cleanup;
	}

	/* Determine the file size; ftell returns -1L on failure. */
	if(fseek (inputFile, 0, SEEK_END)!=0
			|| (rawSize=ftell (inputFile))<0
			|| rawSize>INT_MAX)	/* lengths are handled as int below */
	{
		printf("error reading data from file %s \n",textFileName);
		goto cleanup;
	}
	filesize=(int)rawSize;
	rewind(inputFile);

	if(fread (inputBuffer,sizeof(char),(size_t)filesize,inputFile)!=(size_t)filesize)
	{
		printf("error reading data from file %s \n",textFileName);
		goto cleanup;
	}

	if(convertToDNAAlphabet(inputBuffer,filesize, outputBuffer, &validLength))
		goto cleanup;
	if(validLength<minActualLength)
		minActualLength=validLength;

	if(fwrite(outputBuffer, sizeof(char), (size_t)validLength, outputFile)!=(size_t)validLength)
	{
		printf("not all data from file %s was written\n",textFileName);
		goto cleanup;
	}

	/* fclose flushes buffered data, so a late write error shows up here. */
	closeResult=fclose(outputFile);
	outputFile=NULL;
	if(closeResult!=0)
	{
		printf("not all data from file %s was written\n",textFileName);
		goto cleanup;
	}

	printf("written %d out of %d chars\n",validLength,filesize);
	status=0;

cleanup:
	if(outputFile)
		fclose(outputFile);
	if(inputFile)
		fclose(inputFile);
	return status;
}

/*
 * Reads the whole of textFileName, keeps only its a/c/g/t characters
 * (lower-cased) and writes them to DNAFileName. The conversion is done by
 * convertToDNAAlphabetWithPositionMapping, which also writes the original
 * position of every kept character to mappingFileName.
 * Lowers the global minActualLength when this file's cleaned length is
 * smaller than the current value.
 *
 * mappingFileName - position-mapping file, created/truncated
 * textFileName    - input file, read in binary mode
 * DNAFileName     - output file, created/truncated
 * inputBuffer     - caller-supplied scratch buffer that receives the raw
 *                   file; it must be at least as large as the input file (no
 *                   capacity is passed in, so this cannot be checked here)
 * outputBuffer    - caller-supplied scratch buffer for the cleaned data; it
 *                   must be at least as large as the input file
 *
 * Returns 0 on success, 1 on any failure (a message is printed to stdout).
 * The input and DNA files are closed on every path.
 */
int textToDNAWithPositionsMapping (char *mappingFileName, char *textFileName, 
								   char *DNAFileName, char *inputBuffer, char *outputBuffer)
{
	FILE *outputFile=NULL;
	FILE *inputFile=NULL;
	long rawSize;
	int filesize;
	int validLength=0;
	int closeResult;
	int status=1;

	if(!(outputFile= fopen ( DNAFileName , "wb" )))
	{
		printf("Could not open output DNA %s file for writing \n",DNAFileName);
		goto cleanup;
	}

	if(!(inputFile= fopen ( textFileName , "rb" )))
	{
		printf("Could not open input text file %s for reading \n",textFileName);
		goto cleanup;
	}

	/* Determine the file size; ftell returns -1L on failure. */
	if(fseek (inputFile, 0, SEEK_END)!=0
			|| (rawSize=ftell (inputFile))<0
			|| rawSize>INT_MAX)	/* lengths are handled as int below */
	{
		printf("error reading data from file %s \n",textFileName);
		goto cleanup;
	}
	filesize=(int)rawSize;
	rewind(inputFile);

	if(fread (inputBuffer,sizeof(char),(size_t)filesize,inputFile)!=(size_t)filesize)
	{
		printf("error reading data from file %s \n",textFileName);
		goto cleanup;
	}

	if(convertToDNAAlphabetWithPositionMapping(inputBuffer,filesize, outputBuffer,
						&validLength,mappingFileName))
		goto cleanup;
	if(validLength<minActualLength)
		minActualLength=validLength;

	if(fwrite(outputBuffer, sizeof(char), (size_t)validLength, outputFile)!=(size_t)validLength)
	{
		printf("not all data from file %s was written\n",textFileName);
		goto cleanup;
	}

	/* fclose flushes buffered data, so a late write error shows up here. */
	closeResult=fclose(outputFile);
	outputFile=NULL;
	if(closeResult!=0)
	{
		printf("not all data from file %s was written\n",textFileName);
		goto cleanup;
	}

	printf("written %d out of %d chars into dna file %s\n",validLength,filesize,DNAFileName);
	status=0;

cleanup:
	if(outputFile)
		fclose(outputFile);
	if(inputFile)
		fclose(inputFile);
	return status;
}



/*
 * Writes "<first><second><suffix>" into dst (capacity cap bytes).
 * Returns 0 on success, 1 if the result does not fit.
 */
static int joinPath(char *dst, size_t cap, const char *first,
		const char *second, const char *suffix)
{
	int n = snprintf(dst, cap, "%s%s%s", first, second, suffix);
	return (n < 0 || (size_t)n >= cap) ? 1 : 0;
}

/*
 * Writes "<prefix>_<index><suffix>" into dst (capacity cap bytes).
 * Returns 0 on success, 1 if the result does not fit.
 */
static int indexedPath(char *dst, size_t cap, const char *prefix,
		int index, const char *suffix)
{
	int n = snprintf(dst, cap, "%s_%d%s", prefix, index, suffix);
	return (n < 0 || (size_t)n >= cap) ? 1 : 0;
}

/*
 * Usage: ./texttodna <textfilesdir> <textfilesprefix> <outputdir>
 *                    <outputprefix> <withmapping 0/1>
 *
 * Directory and prefix are concatenated as-is, so each directory argument
 * must already end with a path separator.
 *
 * Reads three ints from "<outputdir><outputprefix>_input_info":
 *   info[0] - number of input files (processed with subscripts 0..info[0]-1)
 *   info[1] - minimum length, used to seed minActualLength
 *   info[2] - maximum file size, used to size the two work buffers
 * For every subscript i it converts
 *   "<textfilesdir><textfilesprefix>_<i>.txt"  into
 *   "<outputdir><outputprefix>_<i>"
 * and, when withmapping is non-zero, also writes
 *   "<outputdir><outputprefix>_<i>_positionsmapping".
 * Finally the info file is rewritten with info[1] replaced by the smallest
 * cleaned length.
 *
 * Returns 0 on success, 1 on any failure.
 */
int main(int argc, char *argv[]) 
{
	char inputtextfilename[MAX_PATH_LENGTH];
	char inputtextfileprefix[MAX_PATH_LENGTH];
	char outputdnafilename[MAX_PATH_LENGTH];
	char outputdnafileprefix[MAX_PATH_LENGTH];
	char mappingprefix[MAX_PATH_LENGTH];
	char mappingFileName[MAX_PATH_LENGTH];
	char infofilename[MAX_PATH_LENGTH];
	int minSubscript;
	int maxSubscript;
	int maxFileSize;
	int withMapping;
	int i;
	int info[3];
	int status=1;
	size_t bufferSize;
	char *inputBuffer=NULL;
	char *outputBuffer=NULL;
	FILE *infoFile;

	if(argc<6)
	{
		printf("To run: ./texttodna <textfilesdir> <textfilesprefix> <outputdir> <outputprefix> <withmapping 0/1> \n");
		return 1;
	}

	/* Build the fixed path prefixes, refusing anything that would overflow. */
	if(joinPath(inputtextfileprefix, sizeof inputtextfileprefix, argv[1], argv[2], "")
			|| joinPath(outputdnafileprefix, sizeof outputdnafileprefix, argv[3], argv[4], "")
			|| joinPath(mappingprefix, sizeof mappingprefix, argv[3], argv[4], "")
			|| joinPath(infofilename, sizeof infofilename, argv[3], argv[4], "_input_info"))
	{
		printf("Path arguments are too long (limit is %d characters) \n",MAX_PATH_LENGTH-1);
		return 1;
	}

	/* The mapping flag is the fifth argument (see the usage line above). */
	withMapping=atoi(argv[5]);

	//1. read info to compute min subscript, max subscript and max file size
	if(!(infoFile= fopen ( infofilename , "rb" )))
	{
		printf("Could not open input info file %s for reading \n",infofilename);
		return 1;
	}

	if(fread(info, sizeof(int), 3, infoFile)!=3)
	{
		printf("Error reading input info \n");
		fclose(infoFile);
		return 1;
	}
	fclose(infoFile);

	minSubscript=0;
	maxSubscript=info[0]-1;
	maxFileSize=info[2];
	minActualLength=info[1];

	if(maxFileSize<0)
	{
		/* a negative size would turn into a huge allocation request */
		printf("Error reading input info \n");
		return 1;
	}

	/* calloc(0, ...) may legitimately return NULL, so ask for at least 1. */
	bufferSize= maxFileSize>0 ? (size_t)maxFileSize : 1;
	inputBuffer=calloc (bufferSize, sizeof(char));
	outputBuffer=calloc (bufferSize, sizeof(char));
	if(!inputBuffer || !outputBuffer)
	{
		printf("Could not allocate buffers of %d bytes \n",maxFileSize);
		goto cleanup;
	}

	//2. convert every numbered input file
	for(i=minSubscript;i<=maxSubscript;i++)
	{
		if(indexedPath(inputtextfilename, sizeof inputtextfilename, inputtextfileprefix, i, ".txt")
				|| indexedPath(outputdnafilename, sizeof outputdnafilename, outputdnafileprefix, i, "")
				|| indexedPath(mappingFileName, sizeof mappingFileName, mappingprefix, i, "_positionsmapping"))
		{
			printf("Path arguments are too long (limit is %d characters) \n",MAX_PATH_LENGTH-1);
			goto cleanup;
		}

		if(withMapping)
		{
			if(textToDNAWithPositionsMapping(mappingFileName,inputtextfilename,outputdnafilename,inputBuffer,outputBuffer))
				goto cleanup;
		}
		else
		{
			if(textToDNA(inputtextfilename,outputdnafilename,inputBuffer,outputBuffer))
				goto cleanup;
		}
	}

	//3. write updated info
	if(!(infoFile= fopen ( infofilename , "wb" )))
	{
		printf("Could not open input info file %s for writing updated info \n",infofilename);
		goto cleanup;
	}

	info[1]=minActualLength;

	if(fwrite(info, sizeof(int), 3, infoFile)!=3)
	{
		printf("Error writing input info \n");
		fclose(infoFile);
		goto cleanup;
	}
	/* fclose flushes the buffered ints, so its result matters too */
	if(fclose(infoFile)!=0)
	{
		printf("Error writing input info \n");
		goto cleanup;
	}

	printf("Now min file length is %d\n",minActualLength);
	status=0;

cleanup:
	free(inputBuffer);
	free(outputBuffer);
	return status;
}
